#!/usr/bin/python
# -*- coding: ISO-8859-1 -*-
from __future__ import division
from __future__ import absolute_import
from __future__ import division, print_function, unicode_literals

###########################
### Autor: Sebastian Enger / M.Sc.
### Copyright: Sebastian Enger
### Licence: Commercial / OneTipp
### Version: 1.0.7  - 17-10-2015@23:53 Uhr
### Contact: sebastian.enger@gmail.com
### OneTipp Text Tool in Python
###########################


#https://docs.python.org/2/library/configparser.html


######## export PYTHON_EGG_CACHE=/tmp
import pprint
import os
import nltk
# import rocksdb                                            # shared library kann aktuell noch nicht gelesen werden
import MySQLdb                                              # apt-get install python-mysqldb
from sphinxit.core.processor import Search                  # http://sphinxit.readthedocs.org/en/latest/
from sphinxit.core.helpers import BaseSearchConfig
from random import randint
from past.builtins import basestring                        # pip install future
import codecs
import sys
from sumy.parsers.plaintext import PlaintextParser          # https://github.com/miso-belica/sumy
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer as Summarizer
from sumy.nlp.stemmers import Stemmer
import re
from transliterate import translit, get_available_language_codes
import libleipzig
import pprint
import json
from textstat.textstat import textstat                      # https://pypi.python.org/pypi/textstat

os.environ['PYTHON_EGG_CACHE'] = '/home/compress/'

###python -m nltk.downloader -d /usr/share/nltk_data all
####python -m nltk.downloader all
###########nltk.download()
# nltk.download("punkt")

reload(sys)
sys.setdefaultencoding('utf-8')
noDoubleHash    = set()
###re_match        = r"[(\?|\.|\!)][(\t|\r|\n|\s|\w){0,}]([A-Za-z0-9]{1,})" # Match: ". WORT"
re_match        = r"(\?|\.|\!)$" # Match: ". WORT"


# lies die Ein und Ausgabedateien
inputfile   = sys.argv[1]
outputfile  = sys.argv[2]

# http://www.tutorialspoint.com/python/python_command_line_arguments.htm
# read file into string
text = open(inputfile, 'r').read()
#text.decode('utf-8')
text = text.decode("utf-8")


class SphinxitConfig(BaseSearchConfig):
    DEBUG = False
    WITH_META = False
    WITH_STATUS = False
    POOL_SIZE = 5
    # SQL_ENGINE = 'oursql'
    SEARCHD_CONNECTION = {
        'host': '127.0.0.1',
        'port': 9977,
    }


# delimiters      = ['\n', ' ', ',', '.', '?', '!', ':', ';', '\s', '\t', '\r']
# http://pyrocksdb.readthedocs.org/en/v0.4/tutorial/index.html
# https://github.com/sphinxsearch/sphinx/blob/master/api/sphinxapi.py
# http://www.tutorialspoint.com/python/python_database_access.htm
# mysql = MySQLdb.connect("localhost","root","###########99","onetipp" ) # last working
sphinx = MySQLdb.connect(
    host='127.0.0.1',
    user='root',
    passwd='###########99',
    db='onetipp',
    port=9977)  # sphinxQL

cursorSphinx = sphinx.cursor()

mysql = MySQLdb.connect(
    host='127.0.0.1',
    user='root',
    passwd='###########99',
    db='onetipp',
    port=3306)  # Mysql

mysql.autocommit(True)
cursorMysql = mysql.cursor()

def log_warnings(curs):
    for msg in curs.messages:
        if msg[0] == MySQLdb.Warning:
            logging.warn(msg[1])


def deumlaut(s):
    """
    Replaces umlauts with fake-umlauts
    """

    s = s.replace('\xdf', 'ss')
    s = s.replace('\xfc', '&uuml;')
    s = s.replace('\xdc', '&Uuml;')
    s = s.replace('\xf6', '&ouml;')
    s = s.replace('\xd6', '&Ouml;')
    s = s.replace('\xe4', '&auml;')
    s = s.replace('\xc4', '&Auml;')

    #s = s.replace('\xdf', 'ss')
    #s = s.replace('\xfc', 'ue')
    #s = s.replace('\xdc', 'Ue')
    #s = s.replace('\xf6', 'oe')
    #s = s.replace('\xd6', 'Oe')
    #s = s.replace('\xe4', 'ae')
   # s = s.replace('\xc4', 'Ae')
    return s


def summarizeText(s):
    ## sumy: https://github.com/miso-belica/sumy/tree/dev/sumy/summarizers

    sentences           = nltk.sent_tokenize(s)
    sentenceCount       = len(sentences)
    randSentenceCount   = randint(int((sentenceCount/100)*90)+1, sentenceCount)
    # randCount = random.randint(iround(float((sentenceCount / 100) * 55)), iround(sentenceCount))

    parser              = PlaintextParser.from_string(s, Tokenizer("german"))
    stemmer             = Stemmer("german")
    # summarizer             = TextRankSummarizer(stemmer)
    summarizer          = Summarizer(stemmer)
    summary             = summarizer(parser.document, randSentenceCount)

    returnText = ""

#ISO-8859-1
    for sentence in summary:
        returnText += str(sentence)
        returnText += " "

    return returnText

    # Todos:
    # create a stopword list in German
    # if a stopword is part of a synonym
    # give bad minus points


def SynRanker(s,t):
    if not s or not t:
        return -10
    else:
        1

    if not isinstance(s, basestring) or not isinstance(t, basestring):
        return -10
    else:
        1

    startVal        = float(1.0)

    lenSyn          = len(s)
    synHasDigits    = any(i.isdigit() for i in s)
    synhasSonder    = False

    delimiters  = ['\n', ' ', ',', '.', '?', '!', ':', ';', '\s', '\t', '\r']
    re_sonder   = r"(\?|\.|\,|\;|\:|\!|\d)"
    re_space    = r"(\t|\r|\n|\s|\w)"

    firstS      = s[0:1]
    firstT      = t[0:1]

    if s == t:
        startVal -= -0.95
        return -1
    else:
        1

    if lenSyn <= 0:
        startVal -= -0.99
        return -10
    else:
        1

    if lenSyn > 3 and lenSyn < 14:
        startVal += 0
    elif lenSyn <= 3:
        startVal -= 0.35
    else:
        1

    if (' ' in s) and lenSyn >= 14:
        startVal -= 0.75
    elif (' ' in s) and lenSyn < 14:
        startVal -= 0.55
    elif (' ' not in s) and lenSyn >= 14:
        startVal -= 0.05
    elif (' ' not in s) and lenSyn < 14:
        startVal += 0.05
    else:
        1

    if re.search(re_space, s) is not None:
        startVal -= 0.50
    else:
        1

    if re.search(re_sonder, s) is not None:
        startVal -= 0.075
        synhasSonder = True
    else:
        1

    if firstS.isupper() and firstT.isupper():
        startVal += 0.15
    elif firstS.islower() and firstT.islower():
        startVal += 0.15
    elif firstS.isupper() and not firstT.isupper():
        startVal -= 0.25
    elif firstS.islower() and not firstT.islower():
        startVal -= 0.25
    else:
        1

    #print("Synonym: ", s)
    #print("<br>")
    #print("Length: ", lenSyn)
    #print("<br>")
   # print("Digits: ", synHasDigits)
    #print("<br>")
    #print("Space: ", (' ' in s))
    #print("<br>")
    #print("Sonderzeichen: ", synhasSonder)
    #print("<br>")
    #print("SynRank: ", startVal)
    #print("<br>")
    #print("---------------------------------------------------<br>")
    # later ResultCodes
    return float(startVal)


def iround(x):
    """iround(number) -> integer
    Round a number to the nearest integer."""
    return int(round(x) - .5) + (x > 0)

def getSynLeipzig(sl):

    #print ("Auto Syn - Leipzig: ", libleipzig.Thesaurus("Auto",10))
    retContent = []
    retSaveMysql = "W:"+sl

    if not sl:
        return retContent
    elif not isinstance(sl, basestring):
        return retContent
    elif len(sl) < 3:
        return retContent

    synLeipzig = libleipzig.Thesaurus(sl, 150)

    if not synLeipzig:
        return retContent
    else:
        for aSyn in synLeipzig:
            retContent.append(str(aSyn[0]))
            retSaveMysql += ";S:"+(str(aSyn[0]))

    if len(retSaveMysql) > 5:
        raw = json.dumps(retSaveMysql)
        loggit = "INSERT INTO synonym_leipzig(raw,uid) VALUES(%s, %s)"

        try:
            cursorMysql.execute(loggit, (raw, 0))
            mysql.commit()
        except MySQLdb.ProgrammingError:
            print("Function -getSynLeipzig()- failed: The following mysql query failed:")
            print(loggit)

        data = []

    return retContent

# sent_tokenize_list = sent_tokenize(text)
# Summarize the text first and then work on it
tSumy       = summarizeText(text)
tokens      = nltk.word_tokenize(tSumy)
tokensRaw   = nltk.word_tokenize(text)

count                   = -1
changeEveryWord         = 8 #Leistungsschutzrecht: 7 Zeichen dürfen genutzt werden, darüber muss geändert werden
changeEveryWordFlag     = 0
changeEveryWordTemp     = 0 #temporary upcount

for word in tokens:

    count += 1

    wordTemp = word.encode('ascii', 'ignore')
    # cursorMysql.execute("SELECT * FROM (namen_table) WHERE name LIKE '%s%%' LIMIT 1;" % (word))
    cursorMysql.execute("SELECT * FROM (namen_table) WHERE BINARY `name` = '%s' LIMIT 1;" % (wordTemp))
    name_content = cursorMysql.fetchone()
    #print ("SELECT * FROM (namen_table) WHERE name LIKE '%s' LIMIT 1;" % (word))
    #print (name_content)

    #    search_query = Search(indexes=['onetipp_name'], config=SphinxitConfig)
    #    # search_query = search_query.match(word).options(
    #    search_query = search_query.match(word).options(
    #        ranker='proximity_bm25',
    #        max_matches=1,
    #        max_query_time=350,
    #        field_weights={'name': 100, 'gender': -10000, 'language': -10000, 'meaning': -10000},
    #    )
        ###sphinx_result = search_query.ask()
        # exit(0)

    # es wurde ein namen gefunden -> kein synonym austauschen
    if name_content is not None:
        # print("Token: ", tokens)
        #print("Count: ", count)
        #print("<br>")
        #print("Tokencount overall: ", len(tokens))
        #print("<br>")
     #   tokens[count] = '<b style="color:#00FFFF;" title="Namen erkannt"><i>' + deumlaut(word) + '</i></b>'
        tokens[count] = deumlaut(word)
        tokensRaw[count] = deumlaut(word)
        # print "Namen erkannt und nicht getauscht"
        continue
    else:
        1

    if changeEveryWordTemp == (changeEveryWord - 1):
        changeEveryWordFlag     = 0
        changeEveryWordTemp     = 0
    else:
        1

    if changeEveryWordFlag == 1:
        changeEveryWordTemp += 1
    else:
        1

    if len(word) >=4  and changeEveryWordFlag == 0:

        # Versuche zuerst die Leipzig DB anzufordern

        lstcWord        = word[0:1]
        synDictLeipzig  = {}
        sLeipzigList    = getSynLeipzig(word)

        if sLeipzigList:
            for wSynL in sLeipzigList:
                #synDict[SynRanker(wSyn, word)] = wSyn
                if wSynL not in noDoubleHash:
                    synDictLeipzig[wSynL] = SynRanker(wSynL, word)

            sortedSynList       = []
            sortedSynList       = sorted(synDictLeipzig.items(), key=lambda x: x[1], reverse=True)
            firstBestSynHit     = str(sortedSynList[0][0])
            firstBestSynHitRank = str(sortedSynList[0][1])

            # Hat das letzte Wort ein Satzendenzeichen, schreibe das aktuell gleich mal gross
            if re.search(re_match, tokens[count-1]) is not None:
                firstBestSynHit.title()

            if word.endswith('.'):
                firstBestSynHit += '.'
            elif word.endswith('?'):
                firstBestSynHit += '?'
            elif word.endswith('!'):
                firstBestSynHit += '!'
            elif word.endswith(','):
                firstBestSynHit += ','
            elif word.endswith(';'):
                firstBestSynHit += ';'
            elif word.endswith(':'):
                firstBestSynHit += ':'

            # later: Randomly choose one of the synonyms that have all the highest rating
     #       tokens[count] = '<b style="color:#FF99FF; text-decoration: underline" title="SynRank(' + \
     #                       firstBestSynHitRank + ') /Leipzig DB/ ->Synonym ausgetauscht!"><i>' + deumlaut(
     # firstBestSynHit) + '</i></b>'

            tokens[count] = deumlaut(firstBestSynHit)

            noDoubleHash.add(firstBestSynHit)
            tokensRaw[count]    = deumlaut(firstBestSynHit)
            changeEveryWordFlag = 1
            changeEveryWordTemp += 1

        else:

            #nutze unsere lokale Synonym Mysql Datenbank
            search_query_syn = Search(indexes=['onetipp_syn_simple'], config=SphinxitConfig)
            search_query_syn = search_query_syn.match(word).options(
                ranker='proximity_bm25',
                max_matches=1,
                max_query_time=350,
                field_weights={'synonyms': 100},
            )
            sphinx_result_syn = search_query_syn.ask()
            synID = 0

            try:
                synID = sphinx_result_syn['result']['items'][0].values()[0]
                if synID > 0:
                    #    print "SynDB has been found: ", synID

                    #später finde via sphinx noch mehr synonyme und parse diese alle
                    sql         = "SELECT synonyms FROM (synonym_unique_simple) WHERE uid= %s" % (synID)
                    cursorMysql.execute(sql)
                    syn_content = cursorMysql.fetchone()
                    synContent  = list(syn_content)
                    synContent  = synContent[0].decode(encoding="utf-8", errors="ignore")

                    if syn_content:
                        synwords = synContent.split(";")
                        # print SynDictCalculator(synwords)

                        #    http://www.saltycrane.com/blog/2007/09/how-to-sort-python-dictionary-by-keys/
                        #    for key, value in sorted(mydict.iteritems(), key=lambda (k,v): (v,k)):
                        #       print "%s: %s" % (key, value)

                        synDict = {}
                        for wSyn in synwords:
                            #synDict[SynRanker(wSyn, word)] = wSyn
                            if wSyn not in noDoubleHash:
                                synDict[wSyn] = SynRanker(wSyn, word)

                        sortedSynList       = []
                        sortedSynList       = sorted(synDict.items(), key=lambda x: x[1], reverse=True)
                        firstBestSynHit     = str(sortedSynList[0][0])
                        firstBestSynHitRank = str(sortedSynList[0][1])

                        # Hat das letzte Wort ein Satzendenzeichen, schreibe das aktuell gleich mal gross
                        if re.search(re_match, tokens[count-1]) is not None:
                            firstBestSynHit.title()

                        if word.endswith('.'):
                            firstBestSynHit += '.'
                        elif word.endswith('?'):
                            firstBestSynHit += '?'
                        elif word.endswith('!'):
                            firstBestSynHit += '!'
                        elif word.endswith(','):
                            firstBestSynHit += ','
                        elif word.endswith(';'):
                            firstBestSynHit += ';'
                        elif word.endswith(':'):
                            firstBestSynHit += ':'

                        # later: Randomly choose one of the synonyms that have all the highest rating
                       # tokens[count]       = '<b style="color:#FF99FF; text-decoration: underline" title="SynRank(' \
                        #                      + firstBestSynHitRank + ') /LocalMysqL DB/ ->Synonym
                        # ausgetauscht!"><i>' + deumlaut(firstBestSynHit) + '</i></b>'

                        tokens[count] = deumlaut(firstBestSynHit)

                        noDoubleHash.add(firstBestSynHit)
                        tokensRaw[count] = deumlaut(firstBestSynHit)
                        changeEveryWordFlag = 1
                        changeEveryWordTemp += 1
                        #break

            except IndexError:
                1

# file schreiben
outputtext          = ' '.join(tokens)
outputtextRaw       = ' '.join(tokensRaw)

readabilityVar      = str(textstat.flesch_reading_ease(outputtextRaw))

with codecs.open(outputfile, 'w') as f:
    f.write(outputtext )
   # f.write("<span title=\"Flesch Reading Ease: (Grosser Wert=Einfacher zu lesen ### geringer Wert=Schwerer zu
   # "lesen)\">Lesbarkeitswert : </span>" + readabilityVar)
    #f.write("<br><br>")
    #f.write(outputtext)
    #f.write("<br><br>")
    #f.write("RUSSISCHE TRANSLITERATION: BEISPIEL VERSION")
    #f.write("<br><br>")
    #f.write(translit(outputtextRaw, 'ru'))
    f.close()

mysql.commit()
mysql.close()
exit(0)


"""
The Flesch Reading Ease formula

function name - flesch_reading_ease(text)

returns the Flesch Reading Ease Score. Following table is helpful to access the ease of readability in a document.

90-100 : Very Easy
80-89 : Easy
70-79 : Fairly Easy
60-69 : Standard
50-59 : Fairly Difficult
30-49 : Difficult
0-29 : Very Confusing

"""